Introduction

This is a draft; the analysis is still ongoing.

This document focuses on exploring the relationship between the census variables.

Setup

library(tidyverse)
library(magrittr)
library(knitr)
library(GGally)

Data

Load the transformed census data.

# Read the transformed census table: the first 12 columns are parsed as
# character and the remaining 125 as double.
census_data_col_types <- paste0(strrep("c", 12), strrep("d", 125))

census_data_trans <- read_csv(
  "../storage/dati-cpa_2011_all-trans-v0_0_3.csv",
  col_types = census_data_col_types
)

Correlations

Calculate the pairwise Kendall correlation between the transformed variables, testing each pair on a random 1% sample of the data, to identify those that might be excluded from the analysis.

# Names of every column in the range from P1_norm_log10_std to
# E30_E31_norm_log10_std; these are the variables tested for correlation.
candidate_vars <- colnames(
  select(census_data_trans, P1_norm_log10_std:E30_E31_norm_log10_std)
)

# Pairwise Kendall correlation between all candidate variables.
#
# Result: `candidate_vars_cor`, a tibble with one row per unordered pair of
# variables and the columns `var_i`, `var_j`, `estimate` and `p_value`.
#
# Each pair is tested on its own freshly drawn 1% random sample of the rows.
# NOTE(review): no seed is set in the visible code, so the estimates will
# differ between runs -- confirm whether that is intended.

# All unordered pairs, in the same order a nested i < j loop would visit them.
# combn() fails when asked for pairs from fewer than two items, hence the
# guard.
candidate_var_pairs <-
  if (length(candidate_vars) >= 2) {
    combn(candidate_vars, 2, simplify = FALSE)
  } else {
    list()
  }

# One single-row tibble per pair; they are bound together once at the end
# instead of growing a tibble inside the loop.
candidate_vars_cor_rows <- lapply(
  candidate_var_pairs,
  function(var_pair) {
    #cat("Calculating correlation between", var_pair[1], "and", var_pair[2],"\n")
    census_data_trans_sample <-
      census_data_trans %>%
      slice_sample(prop = 0.01)
    pair_cor_test <- cor.test(
      census_data_trans_sample[[var_pair[1]]],
      census_data_trans_sample[[var_pair[2]]],
      method = "kendall"
    )
    tibble(
      var_i = var_pair[1],
      var_j = var_pair[2],
      # as.numeric() drops the name attached to the estimate
      estimate = as.numeric(pair_cor_test[["estimate"]]),
      p_value = as.numeric(pair_cor_test[["p.value"]])
    )
  }
)

# The typed empty tibble guarantees the four columns exist even when there
# are no pairs to test.
candidate_vars_cor <- bind_rows(
  tibble(
    var_i = character(),
    var_j = character(),
    estimate = double(),
    p_value = double()
  ),
  candidate_vars_cor_rows
)

Further explore the most highly correlated variables.

# Thresholds a pair must pass to be listed: p-value below the first,
# correlation estimate above the second.
correlations_cutoff_p_value <- 0.01
correlations_cutoff_estimate <- 0.5

# Table of the pairs passing both thresholds.
# NOTE(review): only positive estimates can pass this filter; strongly
# negative correlations are not listed -- confirm that this is intended.
kable(
  filter(
    candidate_vars_cor,
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  )
)
var_i var_j estimate p_value
P1_norm_log10_std A44_norm_log10_std 0.9020632 0
P1_norm_log10_std PF1_norm_log10_std 0.9315714 0
P1_norm_log10_std E1_norm_log10_std 0.5668273 0
P2_norm_std P9_norm_std 0.6052338 0
P2_norm_std P53_norm_std 0.8484472 0
P9_norm_std P53_norm_std 0.5329157 0
P17_norm_log10_std P131_norm_log10_std 0.5514256 0
P29_norm_log10_std P139_norm_std 0.5288989 0
P33_norm_std P132_norm_std 0.5292381 0
P60_norm_std P61_norm_std 0.7663050 0
P64_norm_std P65_norm_std 0.8096169 0
ST1_norm_log10_std ST2_norm_std 0.5584053 0
ST1_norm_log10_std ST3_norm_std 0.5104780 0
ST2_norm_std ST3_norm_std 0.5243366 0
A3_norm_std A5_A6_A7_norm_std 0.9730359 0
A44_norm_log10_std PF1_norm_log10_std 0.9193828 0
A44_norm_log10_std E1_norm_log10_std 0.5692832 0
PF1_norm_log10_std E1_norm_log10_std 0.5693603 0
PF2_norm_std PF6_norm_log10_std 0.5173849 0
E20_norm_std E24_E25_E26_norm_std 0.6064776 0
E20_norm_std E27_norm_log10_std 0.5406838 0
E24_E25_E26_norm_std E27_norm_log10_std 0.7381162 0
# Unique names of the variables that appear in at least one pair passing both
# cutoffs: all `var_i` values first, followed by all `var_j` values.
correlations_to_explore <-
  candidate_vars_cor %>%
  filter(
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  ) %$%
  c(var_i, var_j) %>%
  unique()

# Scatterplot matrix of the variables listed in `correlations_to_explore`,
# drawn from a random 1% sample of the rows: Kendall correlations in the upper
# triangle, small semi-transparent points in the lower triangle.
correlations_to_explore_panel <-
  census_data_trans %>%
  slice_sample(prop = 0.01) %>%
  # `correlations_to_explore` is a character vector of column names, so it is
  # passed through all_of() rather than embraced with {{ }}, which is meant
  # for function arguments only.
  select(all_of(correlations_to_explore)) %>%
  ggpairs(
    upper = list(continuous = wrap(ggally_cor, method = "kendall")),
    lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1))
  )
print(correlations_to_explore_panel)

# ggsave(
#   "../100-prep/111-classification-variable-selection-top-correlations.png", 
#   correlations_to_explore_panel,
#   width = 600,
#   height = 600,
#   units = "mm",
#   dpi=300
# )

The figure below is an annotated version of the plot above.

Variable code Variable description
P1 Popolazione residente - totale
P2 Popolazione residente - maschi
P53 Popolazione residente - maschi di 6 anni e più
P60 Popolazione residente - totale di 15 anni e più appartenente alle forze di lavoro totale
P61 Popolazione residente - totale di 15 anni e più occupata (FL)
P64 Popolazione residente - maschi di 15 anni e più appartenente alle forze di lavoro
P65 Popolazione residente - maschi di 15 anni e più occupata (FL)
A3 Abitazioni vuote e abitazioni occupate solo da persone non residenti
A5 Altri tipi di alloggio occupati
A6 Abitazioni vuote
A7 Abitazioni occupate solo da persone non residenti
A44 Superficie delle abitazioni occupate da almeno una persona residente
PF1 Famiglie residenti - totale
E24 Edifici ad uso residenziale da 5 a 8 interni
E25 Edifici ad uso residenziale da 9 a 15 interni
E26 Edifici ad uso residenziale con 16 interni o più
E27 Totale interni in edifici ad uso residenziale

Variable selection

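Based on the correlations illustrated above, the following transformed variables are dropped in the code below: A44, PF1, P53, P61, P65, A5_A6_A7 and E27.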

Save values

# Drop the seven listed columns and keep every other column unchanged.
census_data_trans_selected <- select(
  census_data_trans,
  -c(
    A44_norm_log10_std,
    PF1_norm_log10_std,
    P53_norm_std,
    P61_norm_std,
    P65_norm_std,
    A5_A6_A7_norm_std,
    E27_norm_log10_std
  )
)

# List the remaining column names.
names(census_data_trans_selected)
##   [1] "CODREG"                            "REGIONE"                          
##   [3] "CODPRO"                            "PROVINCIA"                        
##   [5] "CODCOM"                            "COMUNE"                           
##   [7] "PROCOM"                            "SEZ2011"                          
##   [9] "NSEZ"                              "ACE"                              
##  [11] "CODLOC"                            "CODASC"                           
##  [13] "P1_norm_log10_std"                 "P2_norm_std"                      
##  [15] "P3_norm_std"                       "P4_norm_std"                      
##  [17] "P5_norm_std"                       "P6_norm_log10_std"                
##  [19] "P7_norm_log10_std"                 "P8_norm_log10_std"                
##  [21] "P9_norm_std"                       "P10_norm_std"                     
##  [23] "P11_norm_std"                      "P12_norm_std"                     
##  [25] "P13_norm_std"                      "P14_norm_log10_std"               
##  [27] "P15_norm_log10_std"                "P16_norm_log10_std"               
##  [29] "P17_norm_log10_std"                "P18_norm_log10_std"               
##  [31] "P19_norm_log10_std"                "P20_norm_log10_std"               
##  [33] "P21_norm_log10_std"                "P22_norm_log10_std"               
##  [35] "P23_norm_log10_std"                "P24_norm_log10_std"               
##  [37] "P25_norm_log10_std"                "P26_norm_log10_std"               
##  [39] "P27_norm_log10_std"                "P28_norm_log10_std"               
##  [41] "P29_norm_log10_std"                "P30_norm_std"                     
##  [43] "P31_norm_std"                      "P32_norm_std"                     
##  [45] "P33_norm_std"                      "P34_norm_std"                     
##  [47] "P35_norm_std"                      "P36_norm_std"                     
##  [49] "P37_norm_std"                      "P38_norm_std"                     
##  [51] "P39_norm_std"                      "P40_norm_std"                     
##  [53] "P41_norm_std"                      "P42_norm_std"                     
##  [55] "P43_norm_std"                      "P44_norm_std"                     
##  [57] "P45_norm_std"                      "P46_norm_log10_std"               
##  [59] "P47_norm_log10_std"                "P48_norm_std"                     
##  [61] "P49_norm_std"                      "P50_norm_log10_std"               
##  [63] "P51_norm_log10_std"                "P52_norm_log10_std"               
##  [65] "P54_norm_std"                      "P55_norm_std"                     
##  [67] "P56_norm_std"                      "P57_norm_std"                     
##  [69] "P58_norm_std"                      "P59_norm_std"                     
##  [71] "P60_norm_std"                      "P62_norm_log10_std"               
##  [73] "P64_norm_std"                      "P66_norm_std"                     
##  [75] "P128_norm_std"                     "P129_norm_std"                    
##  [77] "P130_norm_log10_std"               "P131_norm_log10_std"              
##  [79] "P132_norm_std"                     "P135_norm_log10_std"              
##  [81] "P136_norm_std"                     "P137_norm_std"                    
##  [83] "P138_norm_std"                     "P139_norm_std"                    
##  [85] "P140_norm_std"                     "ST1_norm_log10_std"               
##  [87] "ST2_norm_std"                      "ST3_norm_std"                     
##  [89] "ST4_norm_std"                      "ST5_norm_log10_std"               
##  [91] "ST6_norm_std"                      "ST7_norm_std"                     
##  [93] "ST8_norm_std"                      "ST9_norm_std"                     
##  [95] "ST10_ST11_ST12_ST13_ST14_norm_std" "A2_norm_std"                      
##  [97] "A3_norm_std"                       "A46_norm_log10_std"               
##  [99] "A47_norm_std"                      "A48_norm_log10_std"               
## [101] "PF2_norm_std"                      "PF3_norm_std"                     
## [103] "PF4_norm_std"                      "PF5_norm_log10_std"               
## [105] "PF6_norm_log10_std"                "PF7_PF8_norm_log10_std"           
## [107] "E1_norm_log10_std"                 "E3_norm_std"                      
## [109] "E4_norm_log10_std"                 "E5_norm_std"                      
## [111] "E6_norm_std"                       "E7_norm_log10_std"                
## [113] "E8_norm_log10_std"                 "E9_norm_log10_std"                
## [115] "E10_norm_log10_std"                "E11_norm_log10_std"               
## [117] "E12_norm_log10_std"                "E13_norm_log10_std"               
## [119] "E14_E15_E16_norm_log10_std"        "E17_norm_log10_std"               
## [121] "E18_norm_std"                      "E19_norm_std"                     
## [123] "E20_norm_std"                      "E21_norm_std"                     
## [125] "E22_norm_std"                      "E23_norm_log10_std"               
## [127] "E24_E25_E26_norm_std"              "E28_norm_std"                     
## [129] "E29_norm_std"                      "E30_E31_norm_log10_std"
# Write the reduced table to a CSV file.
write_csv(
  census_data_trans_selected,
  "../storage/dati-cpa_2011_all-trans-selected-v0_0_3.csv"
)

Conclusions

This is a draft; the analysis is still ongoing.

The notes in the Variable selection section above need to be revised and the transformation process updated accordingly.

Acknowledgements

This analysis uses data from ISTAT distributed under CC BY 3.0 IT (see also legal notice).